from loguru import logger
from torchtext.data.utils import get_tokenizer

from config.config import cfg
from utils.utils import build_vocab
from utils.utils import log_init

if __name__ == '__main__':
    # Manual check of vocabulary construction: build the vocab for the
    # 'train' split and write a sample of its entries to the debug log.
    log_init('test_buildvocab', log_dir=cfg.log_dir)

    basic_tokenizer = get_tokenizer("basic_english")
    train_vocab = build_vocab(cfg.dataset_dir, 'train', basic_tokenizer)

    # NOTE(review): get_stoi() is presumably the token -> index mapping of a
    # torchtext Vocab, so each entry here is a (token, index) pair -- confirm
    # against build_vocab's return type.
    token_index_pairs = list(train_vocab.get_stoi().items())

    # Only the first 30 entries are logged to keep the output short.
    preview = token_index_pairs[:30]
    logger.debug(f'vocab = {preview}')
